{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Musicians- Medium"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Setting default log level to \"WARN\".\n",
      "To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).\n"
     ]
    }
   ],
   "source": [
    "import findspark\n",
    "import pandas as pd\n",
    "findspark.init()\n",
    "\n",
    "SVR = '192.168.31.31'\n",
    "from pyspark.sql import SparkSession\n",
    "from pyspark.sql.functions import *\n",
    "from pyspark.sql import Window\n",
    "\n",
    "sc = (SparkSession.builder.appName('app16-2') \n",
    "      .master(f'spark://{SVR}:7077') \n",
    "      .config('spark.sql.warehouse.dir', f'hdfs://{SVR}:9000/user/hive/warehouse') \n",
    "      .config('spark.cores.max', '4') \n",
    "      .config('spark.executor.instances', '1') \n",
    "      .config('spark.executor.cores', '2') \n",
    "      .config('spark.executor.memory', '10g') \n",
    "      .enableHiveSupport().getOrCreate())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "band = sc.read.table('sqlzoo.band')\n",
    "composer = sc.read.table('sqlzoo.composer')\n",
    "composition = sc.read.table('sqlzoo.composition')\n",
    "concert = sc.read.table('sqlzoo.concert')\n",
    "has_composed = sc.read.table('sqlzoo.has_composed')\n",
    "musician = sc.read.table('sqlzoo.musician')\n",
    "performance = sc.read.table('sqlzoo.performance')\n",
    "performer = sc.read.table('sqlzoo.performer')\n",
    "place = sc.read.table('sqlzoo.place')\n",
    "plays_in = sc.read.table('sqlzoo.plays_in')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 6.\n",
    "**List the names, dates of birth and the instrument played of living musicians who play a instrument which Theo also plays.**"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "                                                                                \r"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>m_name</th>\n",
       "      <th>born</th>\n",
       "      <th>instrument</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>Harry Forte</td>\n",
       "      <td>1951-02-28</td>\n",
       "      <td>drums</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>Harry Forte</td>\n",
       "      <td>1951-02-28</td>\n",
       "      <td>violin</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>James First</td>\n",
       "      <td>1965-06-10</td>\n",
       "      <td>violin</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>Jeff Dawn</td>\n",
       "      <td>1945-12-12</td>\n",
       "      <td>violin</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>John Smith</td>\n",
       "      <td>1950-03-03</td>\n",
       "      <td>violin</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "        m_name        born instrument\n",
       "0  Harry Forte  1951-02-28      drums\n",
       "1  Harry Forte  1951-02-28     violin\n",
       "2  James First  1965-06-10     violin\n",
       "3    Jeff Dawn  1945-12-12     violin\n",
       "4   John Smith  1950-03-03     violin"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "(musician.filter((musician['died'].isNull()) & \n",
    "                 (~musician['m_name'].like('Theo%')))\n",
    " .join(performer, on=(musician['m_no']==col('perf_is')))\n",
    " .join(musician.filter(musician['m_name'].like('Theo%'))\n",
    "        .join(performer, on=(musician['m_no']==performer['perf_is']))\n",
    "        .select('instrument'), on='instrument')\n",
    " .select('m_name', 'born', 'instrument')\n",
    " .orderBy('m_name')\n",
    " .toPandas())"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 7.\n",
    "**List the name and the number of players for the band whose number of players is greater than the average number of players in each band.**"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>band_name</th>\n",
       "      <th>n_mbr</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>ROP</td>\n",
       "      <td>7</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>AASO</td>\n",
       "      <td>7</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>Oh well</td>\n",
       "      <td>5</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "  band_name  n_mbr\n",
       "0       ROP      7\n",
       "1      AASO      7\n",
       "2   Oh well      5"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "(band.join(plays_in, on=(band['band_no']==plays_in['band_id']))\n",
    " .join(performer, on=(plays_in['player']==performer['perf_no']))\n",
    " .select('band_name', 'perf_is')\n",
    " .distinct()\n",
    " .groupBy('band_name')\n",
    " .agg(count('perf_is').alias('n_mbr'))\n",
    " .withColumn('avg_mbr', avg('n_mbr').over(Window.partitionBy(lit(0))))\n",
    " .filter(col('n_mbr') > col('avg_mbr'))\n",
    " .select('band_name', 'n_mbr')\n",
    " .toPandas())"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 8.\n",
    "**List the names of musicians who both conduct and compose and live in Britain.**"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>m_name</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>Fred Bloggs</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>Jeff Dawn</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>Phil Hot</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>Rose Spring</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>Tony Smythe</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "        m_name\n",
       "0  Fred Bloggs\n",
       "1    Jeff Dawn\n",
       "2     Phil Hot\n",
       "3  Rose Spring\n",
       "4  Tony Smythe"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "(musician.join(composer, on=(musician['m_no']==composer['comp_is']))\n",
    " .join(place.filter(place['place_country'].isin('England', 'Scotland')), \n",
    "       on=(musician['living_in']==place['place_no']))\n",
    " .join(performance, on=(musician['m_no']==performance['conducted_by']))\n",
    " .select('m_name')\n",
    " .distinct()\n",
    " .orderBy('m_name')\n",
    " .toPandas())"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 9.\n",
    "**Show the least commonly played instrument and the number of musicians who play it.**"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>instrument</th>\n",
       "      <th>n_musician</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>clarinet</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "  instrument  n_musician\n",
       "0   clarinet           1"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "(performer.join(plays_in, on=(performer['perf_no']==plays_in['player']))\n",
    " .join(performance, on=(plays_in['band_id']==performance['gave']))\n",
    " .groupBy('instrument')\n",
    " .agg(count('perf_no').alias('n_musician'))\n",
    " .orderBy('n_musician')\n",
    " .limit(1)\n",
    " .toPandas())"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 10.\n",
    "**List the bands that have played music composed by Sue Little; Give the titles of the composition in each case.**"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>band_name</th>\n",
       "      <th>c_title</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>BBSO</td>\n",
       "      <td>Slow Song</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>BBSO</td>\n",
       "      <td>Slow Symphony Blowing</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>Somebody Loves this</td>\n",
       "      <td>Slow Symphony Blowing</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>Swinging strings</td>\n",
       "      <td>Slow Song</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>The left Overs</td>\n",
       "      <td>Slow Song</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "             band_name                c_title\n",
       "0                 BBSO              Slow Song\n",
       "1                 BBSO  Slow Symphony Blowing\n",
       "2  Somebody Loves this  Slow Symphony Blowing\n",
       "3     Swinging strings              Slow Song\n",
       "4       The left Overs              Slow Song"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "(composition\n",
    " .join(has_composed, on=(composition['c_no']==has_composed['cmpn_no']))\n",
    " .join(composer, on=(has_composed['cmpr_no']==composer['comp_no']))\n",
    " .join(musician.filter(musician['m_name']=='Sue Little'),\n",
    "       on=(composer['comp_is']==musician['m_no']))\n",
    " .select('c_no', 'c_title')\n",
    " .join(performance, on=(col('c_no')==performance['performed']))\n",
    " .join(band, on=(performance['gave']==band['band_no']))\n",
    " .select('band_name', 'c_title')\n",
    " .orderBy('band_name')\n",
    " .toPandas())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "sc.stop()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.10"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
